Load key packages
# Load the `nyc_airbnb` dataset into the workspace.
# NOTE(review): presumably provided by a package attached earlier (not visible
# in this document) -- confirm.
data("nyc_airbnb")
Where are max and min airbnb prices? What are avg prices? What factors affect prices? Where are prices going up and down over time? What’s the relationship between number of reviews and avg review? Does room type affect availability? What’s the location of the unit? What areas are popular? Popularity = number of units? avg price? Are there repeat hosts? If so, what does that mean?
Prices…
# Lowest and highest listing price, skipping missing values.
summarize(
  nyc_airbnb,
  min_price = min(price, na.rm = TRUE),
  max_price = max(price, na.rm = TRUE)
)
## # A tibble: 1 × 2
## min_price max_price
## <dbl> <dbl>
## 1 10 10000
# Overall mean listing price, skipping missing values. The summary column is
# left unnamed, so it is labelled with the expression itself.
summarize(nyc_airbnb, mean(price, na.rm = TRUE))
## # A tibble: 1 × 1
## `mean(price, na.rm = TRUE)`
## <dbl>
## 1 145.
Distribution of prices in the 5 boroughs
# Histogram of price with one panel (and fill colour) per borough. The x scale
# is limited to 0-250, so observations outside that range are removed from the
# plot rather than merely hidden.
ggplot(nyc_airbnb, aes(x = price, fill = neighbourhood_group)) +
  geom_histogram() +
  facet_grid(cols = vars(neighbourhood_group)) +
  scale_x_continuous(limits = c(0, 250))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3810 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_bar()`).
does room type affect availability?
# Violin plot of `availability_365` for each room type.
nyc_airbnb |>
  ggplot(aes(x = room_type, y = availability_365)) +
  geom_violin()
mean price for entire home vs private room vs shared room
# Mean price per room type. `na.rm = TRUE` matches the other price summaries
# in this analysis and keeps a single missing price from turning a whole
# group's mean into NA.
nyc_airbnb |>
  group_by(room_type) |>
  summarize(mean_price = mean(price, na.rm = TRUE))
## # A tibble: 3 × 2
## room_type mean_price
## <chr> <dbl>
## 1 Entire home/apt 207.
## 2 Private room 87.5
## 3 Shared room 70.2
Comparing number of reviews vs. location review score
# Scatterplot of review count against the location review score.
# `na.rm` is not an aesthetic, so it has no effect inside aes() and is removed
# here. To silence the missing-value warning, pass `na.rm = TRUE` to
# geom_point() instead.
ggplot(nyc_airbnb, aes(x = number_of_reviews, y = review_scores_location)) +
  geom_point()
## Warning: Removed 10037 rows containing missing values or values outside the scale range
## (`geom_point()`).
Repeat hosts
# Hosts with more than one listing: keep one row per distinct
# (host_id, host_name, listing count) combination, largest counts first.
nyc_airbnb |>
  filter(calculated_host_listings_count > 1) |>
  distinct(host_id, host_name, calculated_host_listings_count) |>
  arrange(desc(calculated_host_listings_count))
## # A tibble: 3,944 × 3
## host_id host_name calculated_host_listings_count
## <dbl> <chr> <dbl>
## 1 26377263 Stat 35
## 2 51913826 The Bowery House 33
## 3 417504 Dana 28
## 4 81634538 Rio 20
## 5 8874674 Laura 18
## 6 440022 Petter 18
## 7 53213930 West 42nd Street 16
## 8 47554473 Mae 15
## 9 31307789 Luffy 14
## 10 96098402 Carrie 13
## # ℹ 3,934 more rows
# Every listing whose host has a listing count above 34.
filter(nyc_airbnb, calculated_host_listings_count > 34)
## # A tibble: 35 × 17
## id review_scores_location name host_id host_name neighbourhood_group
## <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 15057686 NA Home 4… 2.64e7 Stat Brooklyn
## 2 15080936 NA Home 4… 2.64e7 Stat Brooklyn
## 3 14776203 10 Home 4… 2.64e7 Stat Brooklyn
## 4 15074005 NA Home 4… 2.64e7 Stat Brooklyn
## 5 5866656 NA Home 4… 2.64e7 Stat Brooklyn
## 6 5538353 NA Home 4… 2.64e7 Stat Brooklyn
## 7 5632551 10 Home 4… 2.64e7 Stat Brooklyn
## 8 7788565 NA Home 4… 2.64e7 Stat Brooklyn
## 9 7789408 9 Home4 … 2.64e7 Stat Brooklyn
## 10 7789213 10 Home 4… 2.64e7 Stat Brooklyn
## # ℹ 25 more rows
## # ℹ 11 more variables: neighbourhood <chr>, lat <dbl>, long <dbl>,
## # room_type <chr>, price <dbl>, minimum_nights <dbl>,
## # number_of_reviews <dbl>, last_review <date>, reviews_per_month <dbl>,
## # calculated_host_listings_count <dbl>, availability_365 <dbl>
# Total number of reviews per borough (missing counts skipped), highest first.
nyc_airbnb |>
  group_by(neighbourhood_group) |>
  summarize(total_reviews = sum(number_of_reviews, na.rm = TRUE)) |>
  arrange(desc(total_reviews))
## # A tibble: 5 × 2
## neighbourhood_group total_reviews
## <chr> <dbl>
## 1 Manhattan 323941
## 2 Brooklyn 263542
## 3 Queens 66611
## 4 Bronx 9897
## 5 Staten Island 4744
# Median price for every borough / room type pair, then widened so that each
# room type becomes its own column.
nyc_airbnb |>
  group_by(neighbourhood_group, room_type) |>
  summarize(median_price = median(price)) |>
  pivot_wider(names_from = room_type, values_from = median_price)
## `summarise()` has grouped output by 'neighbourhood_group'. You can override
## using the `.groups` argument.
## # A tibble: 5 × 4
## # Groups: neighbourhood_group [5]
## neighbourhood_group `Entire home/apt` `Private room` `Shared room`
## <chr> <dbl> <dbl> <dbl>
## 1 Bronx 100 55 43
## 2 Brooklyn 145 65 40
## 3 Manhattan 190 90 65
## 4 Queens 119 60 39
## 5 Staten Island 112. 55 25
# Price plotted against the location review score.
ggplot(nyc_airbnb, aes(x = review_scores_location, y = price)) +
  geom_point()
## Warning: Removed 10037 rows containing missing values or values outside the scale range
## (`geom_point()`).
looking at locations in Manhattan
# Manhattan listings priced below 1000, plotted by coordinates and coloured by
# price; the low alpha keeps overlapping points distinguishable.
# NOTE(review): `lat` is on the x-axis and `long` on the y-axis -- confirm this
# orientation is intended.
nyc_airbnb |>
  filter(neighbourhood_group == "Manhattan", price < 1000) |>
  ggplot(aes(x = lat, y = long, color = price)) +
  geom_point(alpha = 0.1)
descending mean price by neighborhood
# Mean price of entire-home listings (priced below 1000) in each Manhattan
# neighbourhood, most expensive first.
nyc_airbnb |>
  filter(
    neighbourhood_group == "Manhattan",
    price < 1000,
    room_type == "Entire home/apt"
  ) |>
  group_by(neighbourhood) |>
  summarize(mean_price = mean(price)) |>
  arrange(desc(mean_price))
## # A tibble: 32 × 2
## neighbourhood mean_price
## <chr> <dbl>
## 1 Tribeca 358.
## 2 NoHo 312.
## 3 Flatiron District 307.
## 4 SoHo 296.
## 5 Theater District 282.
## 6 Midtown 276.
## 7 Battery Park City 271.
## 8 Greenwich Village 256.
## 9 Chelsea 255.
## 10 Financial District 250.
## # ℹ 22 more rows
# Price distribution of entire-home listings (priced below 1000) per Manhattan
# neighbourhood. The neighbourhood factor is reordered by price so the violins
# appear in ascending order, and the x labels are rotated to avoid overlap.
nyc_airbnb |>
  filter(
    neighbourhood_group == "Manhattan",
    price < 1000,
    room_type == "Entire home/apt"
  ) |>
  mutate(neighbourhood = fct_reorder(neighbourhood, price)) |>
  ggplot(aes(x = neighbourhood, y = price)) +
  geom_violin() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
Go back to that map
# Static coordinate scatter of Manhattan listings priced below 1000, coloured
# by price.
# NOTE(review): `lat` is mapped to x and `long` to y -- confirm this
# orientation is intended.
nyc_airbnb |>
  filter(neighbourhood_group == "Manhattan", price < 1000) |>
  ggplot(aes(x = lat, y = long, color = price)) +
  geom_point(alpha = 0.1)
Leaflet package loaded – interactive map created below (look at link online). `slice` gives us the first 100 rows because it's a huge dataset. If we change the output from github doc to html doc, we can knit and the leaflet map will show up.
# Interactive map of the first 100 Manhattan listings priced below 1000.
# NOTE(review): addCircleMarkers() takes `lng` before `lat`, so the `lat`
# column is supplied as `lng` and `long` as `lat` here. The argument names
# only make that explicit -- confirm against the columns' actual contents.
nyc_airbnb |>
  filter(neighbourhood_group == "Manhattan", price < 1000) |>
  slice_head(n = 100) |>
  leaflet() |>
  addTiles() |>
  addCircleMarkers(lng = ~lat, lat = ~long, radius = 2)
# Continuous red -> green -> blue palette for colouring map markers by price.
# The domain is the range of prices below 1000, matching the filter applied
# before mapping. The previous domain referenced a column that does not exist
# in `nyc_airbnb`, which silently left the domain as NULL.
pal <- colorNumeric(
  palette = c("red", "green", "blue"),
  domain = range(nyc_airbnb$price[nyc_airbnb$price < 1000], na.rm = TRUE)
)
# Interactive map of all Manhattan listings priced below 1000 on the CartoDB
# Positron basemap, with markers coloured by price through `pal`.
# NOTE(review): addCircleMarkers() takes `lng` before `lat`, so the `lat`
# column is supplied as `lng` and `long` as `lat` here. The argument names
# only make that explicit -- confirm against the columns' actual contents.
nyc_airbnb |>
  filter(neighbourhood_group == "Manhattan", price < 1000) |>
  leaflet() |>
  addProviderTiles(providers$CartoDB.Positron) |>
  addCircleMarkers(
    lng = ~lat,
    lat = ~long,
    color = ~pal(price),
    radius = 2
  )